Extracting hourly temperature data from NOAA ISD (Integrated Surface Database) weather data
The ish_parser Python module is from: https://github.com/haydenth/ish_parser
In [1]:
# boilerplate includes
import sys
import os
import numpy as np
import matplotlib as mpl
#mpl.use('nbagg')
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
#import mpld3 # for outputting interactive html figures
import pandas as pd
import seaborn as sns
import ish_parser
import gzip
import ftplib
import io
import dateutil
from IPython.display import display, HTML
%matplotlib notebook
plt.style.use('seaborn-notebook')
pd.set_option('display.max_columns', None)
In [2]:
# ---------------------------------------------------------------------------
# PARAMETERS
# ---------------------------------------------------------------------------

# One row per station to process:
#   [callsign, fixed UTC offset ('+HH:MM' / '-HH:MM'), start_date, end_date]
RUNSETS = [
    ['KSFO', '-08:00', '1950-01-01', '2016-01-01'],
    ['KFAT', '-08:00', '1950-01-01', '2016-01-01'],
    ['KJAX', '-05:00', '1950-01-01', '2016-01-01'],
    ['KBUR', '-08:00', '1973-01-01', '2016-01-01'],
    ['KIAH', '-06:00', '1970-01-01', '2016-01-01'],
    ['KLAX', '-08:00', '1950-01-01', '2016-01-01'],
    ['KMCO', '-05:00', '1973-01-01', '2016-01-01'],
    ['KRIV', '-08:00', '1950-01-01', '2016-01-01'],
    ['KTPA', '-05:00', '1950-01-01', '2016-01-01'],
    ['KSAN', '-08:00', '1950-01-01', '2016-01-01'],
    ['KMIA', '-05:00', '1950-01-01', '2016-01-01'],
]

# Directory handed to the sub-notebooks as DATADIR.
TEMPERATURE_DATADIR = '../data/temperatures/ISD'
# Directory handed to the sub-notebooks as OUTDIR; the cleaned .h5 files are
# read from here and the trimmed CSV files are written here.
TEMPERATURE_OUTDIR = '../data/temperatures'
# Not referenced in this notebook -- presumably read by the sub-notebooks.
SUPPRESS_FIGURE_DISPLAY = True

# --- Fetching parameters ---------------------------------------------------
# Don't download new temperature files unless required.
USE_CACHED_STATION_H5_FILES = True
FTPHOST = 'ftp.ncdc.noaa.gov'
FETCH_STATIONS_LIST_FILE = True

# --- Cleaning parameters ---------------------------------------------------
# Label of the hourly temperature column that is created and written out.
TEMP_COL = 'AT'

# Resampling / interpolation settings.
# Spline order used when converting to on-the-hour samples and when filling
# small gaps (1 means linear interpolation).
BASE_INTERPOLATION_K = 1
# Data gaps longer than this get special treatment.
POTENTIALLY_PROBLEMATIC_GAP_SIZE = pd.Timedelta('03:00:00')
In [3]:
for STATION_CALLSIGN, LOCAL_TIME_OFFSET, START_DATE, END_DATE in RUNSETS:
SUBNOTEBOOK_FLAG = True
DATADIR = TEMPERATURE_DATADIR
OUTDIR = TEMPERATURE_OUTDIR
%run -i "Fetching and parsing ISH.ipynb"
%run -i "Cleaning temperatures.ipynb"
In [ ]:
In [4]:
def _utc_offset_seconds(offset):
    """Return a '+HH:MM' / '-HH:MM' UTC-offset string as signed seconds.

    The sign applies to the hours and the minutes together: '-03:30' is
    -12600 seconds. Computing int('-03')*3600 + int('30')*60 instead gives
    -9000, because the sign is then attached to the hours only.
    """
    text = offset.strip()
    sign = -1 if text.startswith('-') else 1
    parts = text.lstrip('+-').split(':')
    return sign * (int(parts[0]) * 3600 + int(parts[1]) * 60)


# For every station, load the cleaned temperatures, shift the index to a
# fixed-offset local time and save everything from START_DATE onward as CSV.
# RUNSET receives the first field of each RUNSETS row (the station callsign).
for RUNSET, LOCAL_TIME_OFFSET, START_DATE, END_DATE in RUNSETS:
    # convert from UTC to a *fixed offset* approximating local time
    # (solar time would be better, but this is good enough)
    # Note: not just the local timezone, because daylight-savings time is a
    # pointless complication
    sitetz = dateutil.tz.tzoffset(LOCAL_TIME_OFFSET,
                                  _utc_offset_seconds(LOCAL_TIME_OFFSET))

    ## Load temperature data
    tfile = os.path.join(TEMPERATURE_OUTDIR, "{}_AT_cleaned.h5".format(RUNSET))
    tempdf = pd.read_hdf(tfile, 'table')
    # apply the timezone conversion
    tempdf.index = tempdf.index.tz_convert(sitetz)

    ## Save the desired date range as a CSV file
    # note: the date ranges were determined by inspecting the full temperature
    # data and excluding problematic early dates (big gaps, lots of outliers, etc.)
    # NOTE(review): END_DATE is unpacked but not applied to this slice, so the
    # export runs from START_DATE to the last available row -- confirm that
    # this is intended.
    outfn = os.path.join(TEMPERATURE_OUTDIR, "{}_AT_cleaned_trimmed".format(RUNSET))
    tempdf_trimmed = tempdf.loc[START_DATE:]
    print("Saving {} {} to {} : {}.csv".format(
        RUNSET, tempdf_trimmed.index[0], tempdf_trimmed.index[-1], outfn))
    tempdf_trimmed.to_csv(outfn + '.csv', index_label='datetime')
In [ ]: